library("dplyr")
library("ggplot2")
library("plotly")
## Warning: package 'plotly' was built under R version 4.4.2
library("zoo")
library("tidyr")
library(data.table)
cv_states <- fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")
state_pops <- fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv")
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
cv_states <- merge(cv_states, state_pops, by = "state")
dim(cv_states)
## [1] 58094 9
head(cv_states)
## Key: <state>
## state date fips cases deaths geo_id population pop_density abb
## <char> <IDat> <int> <int> <int> <int> <int> <num> <char>
## 1: Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 2: Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 3: Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 4: Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 5: Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 6: Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
tail(cv_states)
## Key: <state>
## state date fips cases deaths geo_id population pop_density abb
## <char> <IDat> <int> <int> <int> <int> <int> <num> <char>
## 1: Wyoming 2023-03-18 56 185640 2009 56 577737 5.950611 WY
## 2: Wyoming 2023-03-19 56 185640 2009 56 577737 5.950611 WY
## 3: Wyoming 2023-03-20 56 185640 2009 56 577737 5.950611 WY
## 4: Wyoming 2023-03-21 56 185800 2014 56 577737 5.950611 WY
## 5: Wyoming 2023-03-22 56 185800 2014 56 577737 5.950611 WY
## 6: Wyoming 2023-03-23 56 185800 2014 56 577737 5.950611 WY
str(cv_states)
## Classes 'data.table' and 'data.frame': 58094 obs. of 9 variables:
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ date : IDate, format: "2020-03-13" "2020-03-14" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 6 12 23 29 39 51 78 106 131 157 ...
## $ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : chr "AL" "AL" "AL" "AL" ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "state"
# Format the date
cv_states$date <- as.Date(cv_states$date, format = "%Y-%m-%d")
# Format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
# Order the data first by state, second by date
cv_states <- cv_states[order(cv_states$state, cv_states$date), ]
# Confirm the variables are now correctly formatted
str(cv_states)
## Classes 'data.table' and 'data.frame': 58094 obs. of 9 variables:
## $ state : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date : Date, format: "2020-03-13" "2020-03-14" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 6 12 23 29 39 51 78 106 131 157 ...
## $ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## <fctr> <Date> <int> <int> <int> <int> <int> <num> <fctr>
## 1: Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 2: Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 3: Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 4: Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 5: Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 6: Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## <fctr> <Date> <int> <int> <int> <int> <int> <num> <fctr>
## 1: Wyoming 2023-03-18 56 185640 2009 56 577737 5.950611 WY
## 2: Wyoming 2023-03-19 56 185640 2009 56 577737 5.950611 WY
## 3: Wyoming 2023-03-20 56 185640 2009 56 577737 5.950611 WY
## 4: Wyoming 2023-03-21 56 185800 2014 56 577737 5.950611 WY
## 5: Wyoming 2023-03-22 56 185800 2014 56 577737 5.950611 WY
## 6: Wyoming 2023-03-23 56 185800 2014 56 577737 5.950611 WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
summary(cv_states)
## state date fips cases
## Washington : 1158 Min. :2020-01-21 Min. : 1.00 Min. : 1
## Illinois : 1155 1st Qu.:2020-12-06 1st Qu.:16.00 1st Qu.: 112125
## California : 1154 Median :2021-09-11 Median :29.00 Median : 418120
## Arizona : 1153 Mean :2021-09-10 Mean :29.78 Mean : 947941
## Massachusetts: 1147 3rd Qu.:2022-06-17 3rd Qu.:44.00 3rd Qu.: 1134318
## Wisconsin : 1143 Max. :2023-03-23 Max. :72.00 Max. :12169158
## (Other) :51184
## deaths geo_id population pop_density
## Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
## 1st Qu.: 1598 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
## Median : 5901 Median :29.00 Median : 4468402 Median : 107.860
## Mean : 12553 Mean :29.78 Mean : 6397965 Mean : 423.031
## 3rd Qu.: 15952 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
## Max. :104277 Max. :72.00 Max. :39557045 Max. :11490.120
## NA's :1106
## abb
## WA : 1158
## IL : 1155
## CA : 1154
## AZ : 1153
## MA : 1147
## WI : 1143
## (Other):51184
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2023-03-23"
# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
cv_subset <- subset(cv_states, state == state_list[i])
cv_subset <- cv_subset[order(cv_subset$date), ]
# Add starting level for new cases and deaths
cv_subset$new_cases <- cv_subset$cases[1]
cv_subset$new_deaths <- cv_subset$deaths[1]
# Calculate new cases and new deaths
for (j in 2:nrow(cv_subset)) {
cv_subset$new_cases[j] <- cv_subset$cases[j] - cv_subset$cases[j - 1]
cv_subset$new_deaths[j] <- cv_subset$deaths[j] - cv_subset$deaths[j - 1]
}
# Include in main dataset
cv_states$new_cases[cv_states$state == state_list[i]] <- cv_subset$new_cases
cv_states$new_deaths[cv_states$state == state_list[i]] <- cv_subset$new_deaths
}
# Add population-normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths / (cv_states$population / 100000), 1), nsmall = 1))
# Add a naive_CFR variable = deaths / cases
cv_states = cv_states %>%
dplyr::mutate(naive_CFR = round((deaths * 100 / cases), 2))
# Create a `cv_states_today` variable
cv_states_today = subset(cv_states, date == max(cv_states$date))
### FINISH CODE HERE ###
# pop_density vs. cases
cv_states_today %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5))
## Warning: Ignoring 1 observations
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state != "District of Columbia")
# pop_density vs. cases after filtering
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5))
## Warning: Ignoring 1 observations
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# pop_density vs. deathsper100k
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5))
## Warning: Ignoring 1 observations
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Adding hoverinfo
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5),
hoverinfo = 'text',
text = ~paste(paste(state, ":", sep = ""),
paste(" Cases per 100k: ", per100k, sep = ""),
paste(" Deaths per 100k: ", deathsper100k, sep = ""), sep = "<br>")) %>%
layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
hovermode = "compare")
## Warning: Ignoring 1 observations
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
### FINISH CODE HERE ###
# Create a scatterplot with ggplot for pop_density vs. deathsper100k
p <- ggplot(cv_states_today_filter, aes(x = pop_density, y = deathsper100k, size = population)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE, color = "blue") +
labs(
title = "Relationship Between Population Density and Deaths per 100k",
x = "Population Density",
y = "Deaths per 100k"
)
ggplotly(p)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation: size.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
It appears that as population density increases, deaths per 100k increases. It would appear that it population density is a correlate of newdeathsperweek100k.
### FINISH CODE HERE ###
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state,
type = "scatter", mode = "lines",
colors = RColorBrewer::brewer.pal(12, "Set3"))
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>%
filter(state == "Florida") %>%
plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>%
add_trace(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines")
cv_states %>%
filter(state == "Florida") %>%
plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines", name = "New Cases",
line = list(color = "blue")) %>%
add_trace(y = ~new_deaths, name = "New Deaths", line = list(color = "red")) %>%
layout(
title = "New Cases and Deaths in Florida Over Time",
xaxis = list(title = "Date"),
yaxis = list(title = "Count"),
legend = list(title = list(text = "Metric"))
)
It seems that there is about a 2 month delay between new cases spiking and deaths spiking.
### FINISH CODE HERE ###
### Map state, date, and new_cases to a matrix
cv_states_mat <- cv_states %>%
select(state, date, new_cases) %>%
dplyr::filter(date > as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(
pivot_wider(cv_states_mat, names_from = state, values_from = new_cases, values_fill = 0)
)
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Replace any remaining NA values with 0
cv_states_mat2[is.na(cv_states_mat2)] <- 0
# Verify the range of the data
range(cv_states_mat2, na.rm = TRUE)
## [1] -18255 227972
# Create the heatmap
plot_ly(x = colnames(cv_states_mat2),
y = rownames(cv_states_mat2),
z = ~cv_states_mat2,
type = "heatmap",
showscale = TRUE)
filter_dates <- seq(as.Date("2021-06-15"), as.Date("2021-11-01"), by = "2 weeks")
cv_states_mat <- cv_states %>%
select(state, date, new_cases) %>%
filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(
pivot_wider(cv_states_mat, names_from = state, values_from = new_cases, values_fill = 0)
)
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
plot_ly(x = colnames(cv_states_mat2),
y = rownames(cv_states_mat2),
z = ~cv_states_mat2,
type = "heatmap",
showscale = TRUE)
### FINISH CODE HERE ###
pick.date <- "2021-10-15"
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>%
filter(date == pick.date) %>%
select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>',
"Cases per 100k: ", newper100k, '<br>',
"Cases: ", cases, '<br>',
"Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Make sure both maps are on the same color scale
shadeLimit <- 125
# Create the map
fig_pick.date <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k, text = ~hover, locations = ~state,
color = ~newper100k, colors = 'Purples'
) %>%
colorbar(title = paste0("Cases per 100k: ", pick.date), limits = c(0, shadeLimit)) %>%
layout(
title = paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),
geo = set_map_details
)
### Map for today's date
cv_per100 <- cv_states_today %>%
select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>',
"Cases per 100k: ", newper100k, '<br>',
"Cases: ", cases, '<br>',
"Deaths: ", deaths))
# Create the map
fig_Today <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k, text = ~hover, locations = ~state,
color = ~newper100k, colors = 'Purples'
) %>%
colorbar(title = paste0("Cases per 100k: ", Sys.Date()), limits = c(0, shadeLimit)) %>%
layout(
title = paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),
geo = set_map_details
)
### Plot together
subplot(fig_pick.date, fig_Today, nrows = 2, margin = 0.05)
So the naive case fatality rate is a lot lower in most states in 2024. California seemed to increase as well as Ohio and Illinois. Overall, the US is shaded more lightly indicating lower case fatality rates.